In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Ridge,LogisticRegression
from sklearn.metrics import r2_score , mean_squared_error ,confusion_matrix,accuracy_score
import string as st
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans 
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings("ignore")
import matplotlib as mpl
pd.set_option('display.max_columns',50)
pd.set_option('display.max_rows',50)
df = pd.read_csv("/Users/niveditaj/Downloads/GamingStudy_data 2.csv", encoding = 'ISO-8859-1')
df.head()
df.drop(['S. No.' , 'Timestamp'] , axis = 1 , inplace = True)
df.describe().T
df.dtypes
df.info()
df.isnull().sum()
for i in df:
    print('-------------')
    print(df[i].value_counts().head(15))
    print('-------------')
for i in df:
    print('------',i,'------')
    print(df[i].unique()[:30])
    print('-------------')
for i in df:
    print('------',i,'------')
    print(df[i].nunique())
    print('-------------')
df.shape
df['Hours_streams'] = df['Hours'] + df['streams']
df.drop(  ((df[df['Hours_streams'] > 115].index) | (df[df['Hours_streams']==0].index)),
                                             axis=0,inplace=True)
df['Hours_streams'].value_counts()
df.GADE.value_counts()
df.GADE.fillna(df.GADE.value_counts().index[1] , inplace=True) #1
df.GADE.value_counts()
df.shape
df.streams.fillna(int(df.streams.mean()) , inplace = True)
df.Hours.fillna(int(df.Hours.mean()) , inplace = True)
df.drop('Hours_streams' , axis = 1 , inplace = True)
print(df.League.nunique())
df.League = df.League.str.lower().str.strip()
print(df.League.nunique())
df["League"].value_counts().head(50)
df["League"] =df["League"].str.extract(r'^([a-z]+)')
df.League.nunique()
df.League.unique()
df.loc[(df['whyplay']== 'having fun') ,'League'] =df.loc[(df['whyplay']== 'having fun') ,'League'].fillna('unranked')
df.League.fillna('gold' , inplace = True)
df.League.value_counts()
counts = df['League'].value_counts()
df['League'] = df['League'][~df['League'].isin(counts[counts < 3].index)]
df['League'] = df.League.replace(['i' , 'currently' , 'high' , 'season' , 'lol','cs' ,
                                  'last' ,'csgo','starcraft' ,'geater' , 'in', 'rank' , 'still'] , np.nan)
df.League.fillna('unspecified' , inplace=True)
df.League.unique()
df.League.value_counts()
df['Narcissism'].value_counts()
df.drop(["Birthplace","Birthplace_ISO3"],axis=1,inplace=True)
df['Residence'] = df['Residence'].replace('Unknown',df['Residence'].mode()[0])
df['Reference'].fillna('Other',inplace=True)
df.drop(df[df['accept'].isnull()].index , axis=0 , inplace=True)
df['Residence_ISO3'].fillna('USA',inplace=True) #11063
df.loc[11063,'Residence_ISO3'] = 'XXK'
col = ['SPIN1','SPIN2','SPIN3','SPIN4','SPIN5','SPIN6','SPIN7','SPIN8','SPIN9',
     'SPIN10','SPIN11','SPIN12','SPIN13','SPIN14','SPIN15','SPIN16','SPIN17' ,'SPIN_T']
for i in col :
    df[i].fillna(df[i].mode()[0], inplace = True)
df['Playstyle'] = df['Playstyle'].apply(lambda x: ' '.join(word.strip(st.punctuation) for word in x.split()))
df['earnings'] = df['earnings'].apply(lambda x: ' '.join(word.strip(st.punctuation) for word in x.split()))
df['whyplay'] = df['whyplay'].apply(lambda x: ' '.join(word.strip(st.punctuation) for word in x.split()))
df['Playstyle'] = df['Playstyle'].str.lower().str.strip()
df['whyplay'] = df['whyplay'].str.lower().str.strip()
df['earnings'] = df['earnings'].str.lower().str.strip()
df['Playstyle'].nunique()
df.drop('highestleague' , axis = 1 , inplace = True)
df.head(5)
df.isnull().sum()
df.Work.fillna(df.Work.mode()[0] , inplace=True)
df.drop(['Residence' , 'accept'] , axis = 1 , inplace = True)
df.dtypes
df.earnings.replace(df.earnings.value_counts().index[3:] , 'Other',inplace=True)
df['earnings'].value_counts()
df.whyplay.replace(df.whyplay.value_counts().index[5:] , 'Other',inplace=True)
df['whyplay'].value_counts()
df.Playstyle.replace(df.Playstyle.value_counts().index[5:] , 'Other',inplace=True)
df['Playstyle'].value_counts()
df.Playstyle.replace('Other' , np.nan , inplace=True)
df.whyplay.replace('Other' , np.nan , inplace=True)
df.earnings.replace('Other' , np.nan , inplace=True)
df.isnull().sum()
df.dropna(inplace=True)
df.shape
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13464 entries, 0 to 13463
Data columns (total 53 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   GAD1             13464 non-null  int64  
 1   GAD2             13464 non-null  int64  
 2   GAD3             13464 non-null  int64  
 3   GAD4             13464 non-null  int64  
 4   GAD5             13464 non-null  int64  
 5   GAD6             13464 non-null  int64  
 6   GAD7             13464 non-null  int64  
 7   GADE             12815 non-null  object 
 8   SWL1             13464 non-null  int64  
 9   SWL2             13464 non-null  int64  
 10  SWL3             13464 non-null  int64  
 11  SWL4             13464 non-null  int64  
 12  SWL5             13464 non-null  int64  
 13  Game             13464 non-null  object 
 14  Platform         13464 non-null  object 
 15  Hours            13434 non-null  float64
 16  earnings         13464 non-null  object 
 17  whyplay          13464 non-null  object 
 18  League           11626 non-null  object 
 19  highestleague    0 non-null      float64
 20  streams          13364 non-null  float64
 21  SPIN1            13340 non-null  float64
 22  SPIN2            13310 non-null  float64
 23  SPIN3            13324 non-null  float64
 24  SPIN4            13305 non-null  float64
 25  SPIN5            13298 non-null  float64
 26  SPIN6            13308 non-null  float64
 27  SPIN7            13326 non-null  float64
 28  SPIN8            13320 non-null  float64
 29  SPIN9            13306 non-null  float64
 30  SPIN10           13304 non-null  float64
 31  SPIN11           13277 non-null  float64
 32  SPIN12           13296 non-null  float64
 33  SPIN13           13277 non-null  float64
 34  SPIN14           13308 non-null  float64
 35  SPIN15           13317 non-null  float64
 36  SPIN16           13317 non-null  float64
 37  SPIN17           13289 non-null  float64
 38  Narcissism       13441 non-null  float64
 39  Gender           13464 non-null  object 
 40  Age              13464 non-null  int64  
 41  Work             13426 non-null  object 
 42  Degree           13464 non-null  object 
 43  Birthplace       13464 non-null  object 
 44  Residence        13464 non-null  object 
 45  Reference        13449 non-null  object 
 46  Playstyle        13464 non-null  object 
 47  accept           13050 non-null  object 
 48  GAD_T            13464 non-null  int64  
 49  SWL_T            13464 non-null  int64  
 50  SPIN_T           12814 non-null  float64
 51  Residence_ISO3   13354 non-null  object 
 52  Birthplace_ISO3  13343 non-null  object 
dtypes: float64(22), int64(15), object(16)
memory usage: 5.4+ MB
-------------
0    5694
1    5099
2    1520
3    1151
Name: GAD1, dtype: int64
-------------
-------------
0    7616
1    3576
2    1326
3     946
Name: GAD2, dtype: int64
-------------
-------------
0    5319
1    4708
2    2016
3    1421
Name: GAD3, dtype: int64
-------------
-------------
0    7129
1    3884
2    1488
3     963
Name: GAD4, dtype: int64
-------------
-------------
0    9236
1    2576
2     961
3     691
Name: GAD5, dtype: int64
-------------
-------------
0    5386
1    4993
2    1982
3    1103
Name: GAD6, dtype: int64
-------------
-------------
0    8444
1    2961
2    1211
3     848
Name: GAD7, dtype: int64
-------------
-------------
Not difficult at all    6236
Somewhat difficult      5148
Very difficult          1012
Extremely difficult      419
Name: GADE, dtype: int64
-------------
-------------
2    3043
5    2727
6    2218
3    2099
4    1625
1    1333
7     419
Name: SWL1, dtype: int64
-------------
-------------
6    3715
5    2869
4    1724
3    1660
2    1447
7    1418
1     631
Name: SWL2, dtype: int64
-------------
-------------
6    3417
5    2589
3    1785
2    1756
4    1626
7    1233
1    1058
Name: SWL3, dtype: int64
-------------
-------------
2    2738
5    2396
6    2068
3    1992
4    1864
1    1581
7     825
Name: SWL4, dtype: int64
-------------
-------------
2    2878
1    2740
3    2316
5    1690
6    1557
4    1331
7     952
Name: SWL5, dtype: int64
-------------
-------------
League of Legends      11314
Other                   1021
Starcraft 2              345
Counter Strike           318
World of Warcraft        152
Hearthstone              101
Diablo 3                  89
Heroes of the Storm       41
Guild Wars 2              37
Skyrim                    28
Destiny                   18
Name: Game, dtype: int64
-------------
-------------
PC                         13219
Console (PS, Xbox, ...)      222
Smartphone / Tablet           23
Name: Platform, dtype: int64
-------------
-------------
20.0    2293
10.0    1375
30.0    1186
15.0    1115
25.0     831
40.0     598
14.0     549
12.0     492
35.0     468
21.0     434
8.0      386
5.0      380
18.0     322
16.0     293
50.0     289
Name: Hours, dtype: int64
-------------
-------------
I play for fun                                                                               12112
I play mostly for fun but earn a little on the side (tournament winnings, streaming, etc)      990
I earn a living by playing this game                                                            48
I play to win                                                                                    3
i try                                                                                            2
I play but don't have fun...                                                                     1
Fun and routine                                                                                  1
I don't earn jack ****                                                                           1
I play for relaxation and it gives me a few minutes to release everything                        1
making money from boosting,but mostly fun                                                        1
I play because it's installed                                                                    1
competitive drive                                                                                1
I play because it is the only way i have to show i'm good at something                           1
I play to win and for improvement and want to earn a living playing this game                    1
for my ear                                                                                       1
Name: earnings, dtype: int64
-------------
-------------
having fun                  5289
improving                   4927
winning                     2091
relaxing                     654
All of the above              26
all of the above              22
Improving and having fun      10
improving and having fun       8
winning and improving          6
having fun and improving       5
distraction                    4
Wasting time                   4
All                            3
relaxing and having fun        3
winning while having fun       3
Name: whyplay, dtype: int64
-------------
-------------
Gold         970
Silver       650
Platinum     635
Diamond      549
gold         316
Unranked     264
Diamond 5    217
silver       212
Gold V       205
Gold 1       202
Gold 3       197
Silver 1     197
Gold 5       194
Silver 2     179
Gold 2       171
Name: League, dtype: int64
-------------
-------------
Series([], Name: highestleague, dtype: int64)
-------------
-------------
10.0    2181
5.0     1670
20.0    1182
2.0      990
15.0     953
3.0      832
0.0      826
4.0      714
8.0      576
1.0      510
6.0      505
30.0     399
7.0      325
12.0     300
25.0     242
Name: streams, dtype: int64
-------------
-------------
0.0    5677
1.0    4665
2.0    2363
3.0     500
4.0     135
Name: SPIN1, dtype: int64
-------------
-------------
0.0    4579
1.0    3883
2.0    2793
3.0    1571
4.0     484
Name: SPIN2, dtype: int64
-------------
-------------
0.0    5337
1.0    3863
2.0    2285
3.0    1194
4.0     645
Name: SPIN3, dtype: int64
-------------
-------------
1.0    3672
2.0    3192
0.0    2692
3.0    2660
4.0    1089
Name: SPIN4, dtype: int64
-------------
-------------
0.0    4677
1.0    3853
2.0    2333
3.0    1650
4.0     785
Name: SPIN5, dtype: int64
-------------
-------------
0.0    4463
1.0    3659
2.0    2277
3.0    1906
4.0    1003
Name: SPIN6, dtype: int64
-------------
-------------
0.0    6113
1.0    3218
2.0    1916
3.0    1367
4.0     712
Name: SPIN7, dtype: int64
-------------
-------------
0.0    4768
1.0    3101
2.0    2204
3.0    1782
4.0    1465
Name: SPIN8, dtype: int64
-------------
-------------
0.0    4009
1.0    3158
2.0    2393
3.0    2073
4.0    1673
Name: SPIN9, dtype: int64
-------------
-------------
0.0    6412
1.0    3647
2.0    1710
3.0     997
4.0     538
Name: SPIN10, dtype: int64
-------------
-------------
0.0    3574
1.0    2730
4.0    2596
3.0    2395
2.0    1982
Name: SPIN11, dtype: int64
-------------
-------------
0.0    6871
1.0    3196
2.0    1654
3.0     942
4.0     633
Name: SPIN12, dtype: int64
-------------
-------------
0.0    9146
1.0    2096
2.0    1304
3.0     474
4.0     257
Name: SPIN13, dtype: int64
-------------
-------------
0.0    4501
1.0    4127
2.0    2264
3.0    1652
4.0     764
Name: SPIN14, dtype: int64
-------------
-------------
0.0    4557
1.0    3351
2.0    2128
3.0    1940
4.0    1341
Name: SPIN15, dtype: int64
-------------
-------------
0.0    8281
1.0    2911
2.0    1280
3.0     586
4.0     259
Name: SPIN16, dtype: int64
-------------
-------------
0.0    6738
1.0    3038
2.0    1738
3.0    1176
4.0     599
Name: SPIN17, dtype: int64
-------------
-------------
1.0    5259
2.0    4360
3.0    2341
4.0    1153
5.0     328
Name: Narcissism, dtype: int64
-------------
-------------
Male      12699
Female      713
Other        52
Name: Gender, dtype: int64
-------------
-------------
18    3588
19    2159
20    1855
21    1507
22    1135
23     872
24     660
25     514
26     338
27     231
28     157
29     119
30      89
31      58
32      53
Name: Age, dtype: int64
-------------
-------------
Student at college / university    7073
Employed                           2734
Student at school                  2229
Unemployed / between jobs          1390
Name: Work, dtype: int64
-------------
-------------
High school diploma (or equivalent)    8560
Bachelor (or equivalent)               2687
None                                   1577
Master (or equivalent)                  546
Ph.D., Psy. D., MD (or equivalent)       94
Name: Degree, dtype: int64
-------------
-------------
USA            4261
Germany        1376
UK              940
Canada          842
Netherlands     503
France          411
Sweden          364
Poland          281
Brazil          260
Portugal        241
Denmark         240
Australia       232
Spain           196
Finland         195
Norway          186
Name: Birthplace, dtype: int64
-------------
-------------
USA            4569
Germany        1413
UK             1032
Canada          994
Netherlands     506
France          391
Sweden          379
Denmark         254
Brazil          253
Australia       245
Poland          243
Portugal        229
Spain           203
Norway          193
Belgium         189
Name: Residence, dtype: int64
-------------
-------------
Reddit            13324
Other                67
TeamLiquid.net       55
CrowdFlower           3
Name: Reference, dtype: int64
-------------
-------------
Multiplayer - online - with real life friends                    5564
Multiplayer - online - with strangers                            4134
Multiplayer - online - with online acquaintances or teammates    2652
Singleplayer                                                      762
Multiplayer - offline (people in the same room)                    48
all of the above                                                    7
All of the above                                                    6
Depends                                                             1
Multiplayer-online-real life friends and online acquaintances       1
Equal mix of real life friends and online                           1
multi-online-with strangers & real life friends                     1
MP with strangers and teammates are equal                           1
Multiplayer - online - with gf                                      1
Its pretty 33%: SP(KSP), MP friends(CS), MP strangers(LoL))         1
Duo Queue with one good friend, else solo                           1
Name: Playstyle, dtype: int64
-------------
-------------
Accept    13050
Name: accept, dtype: int64
-------------
-------------
0     1669
2     1618
1     1485
3     1455
4     1219
5     1024
6      849
7      720
8      598
9      475
10     407
11     367
12     308
13     292
14     239
Name: GAD_T, dtype: int64
-------------
-------------
24    636
20    607
18    606
22    600
21    595
15    590
19    588
23    587
17    580
16    576
25    569
26    568
14    550
27    540
28    525
Name: SWL_T, dtype: int64
-------------
-------------
14.0    444
11.0    443
12.0    440
7.0     426
13.0    413
8.0     409
10.0    408
9.0     401
17.0    395
15.0    388
16.0    380
5.0     357
19.0    351
18.0    351
6.0     349
Name: SPIN_T, dtype: int64
-------------
-------------
USA    4569
DEU    1413
GBR    1032
CAN     994
NLD     506
FRA     391
SWE     379
DNK     254
BRA     253
AUS     245
POL     243
PRT     229
ESP     203
NOR     193
BEL     189
Name: Residence_ISO3, dtype: int64
-------------
-------------
USA    4261
DEU    1376
GBR     940
CAN     842
NLD     503
FRA     411
SWE     364
POL     281
BRA     260
PRT     241
DNK     240
AUS     232
ESP     196
FIN     195
NOR     186
Name: Birthplace_ISO3, dtype: int64
-------------
------ GAD1 ------
[0 1 2 3]
-------------
------ GAD2 ------
[0 2 1 3]
-------------
------ GAD3 ------
[0 2 3 1]
-------------
------ GAD4 ------
[0 2 3 1]
-------------
------ GAD5 ------
[1 0 2 3]
-------------
------ GAD6 ------
[0 1 3 2]
-------------
------ GAD7 ------
[0 1 2 3]
-------------
------ GADE ------
['Not difficult at all' 'Somewhat difficult' 'Very difficult' nan
 'Extremely difficult']
-------------
------ SWL1 ------
[3 2 5 7 6 1 4]
-------------
------ SWL2 ------
[5 6 2 4 3 7 1]
-------------
------ SWL3 ------
[5 2 4 3 6 7 1]
-------------
------ SWL4 ------
[5 2 3 6 1 7 4]
-------------
------ SWL5 ------
[5 1 2 3 7 6 4]
-------------
------ Game ------
['Skyrim' 'Other' 'World of Warcraft' 'League of Legends' 'Starcraft 2'
 'Counter Strike' 'Destiny' 'Diablo 3' 'Heroes of the Storm' 'Hearthstone'
 'Guild Wars 2']
-------------
------ Platform ------
['Console (PS, Xbox, ...)' 'PC' 'Smartphone / Tablet']
-------------
------ Hours ------
[15.  8.  0. 20.  4. 30. 25.  2. 14. 10.  6.  7. 60. 21.  5. 40.  9. 18.
 12. nan 11.  1. 45. 16. 50. 28. 42. 80. 17. 13.]
-------------
------ earnings ------
['I play for fun'
 'I play mostly for fun but earn a little on the side (tournament winnings, streaming, etc)'
 'for fun and to get better'
 'I intend to do both if I become good enough at some point.'
 'for fun and improvement'
 'i play for fun atm, but i pretend to earn a little by streaming.'
 'I earn a living by playing this game' 'Escapism'
 'I play to try and get to the point of making money.'
 'I play to escape life' 'Want to be a shoutcaster'
 'I play to forget about real life' 'Play for fun and to get better at it'
 'I play for the competitive aspect'
 'play for fun, would like to earn money for it'
 'wants money but dont get it.'
 'I play with unrealistic dreams of making money' 'Eloboosting.'
 "I think I might be addicted in the way that I think playing will be fun, but when I play I realize it's not."
 "I love it. I play for fun, but League is so much more than just the fun of playing it. The community, Reddit, LCS, everything. It's just so big and beautiful."
 'boosting idiots because they suck and i can make living from it.'
 'Eloboosting service, Yes' 'want to play for money'
 'I play for the competitive edge. I enjoy competition.'
 "I fucking hate this game so much, I only play it because its the only game I'm good at, and my friends play."
 'I play as a part in socializing with friends far away'
 'I play to suppress memories/feelings'
 'I play for fun hoping to one day make money from it' 'Boredom'
 "I'm trying to get money for playing it"]
-------------
------ whyplay ------
['having fun' 'improving' 'relaxing' 'winning' 'improving, having fun'
 'All'
 "I play it as I watch TV or movies.  I've gone through many a Netflix binge with Isaac."
 'Reaching goal i.e. GM '
 'Improving AND having fun (kinda wish I could pick more than one)'
 'all of the above' 'passing the time' 'having fun and improving'
 'Forgetting troubles' 'All of them' 'Being with friends'
 'getting good loot' 'have fun and win' 'Winning 55% improving 45%'
 'Wasting time' 'talking to irl friends'
 'Forgetting about my drug addiction'
 'Having fun and improving at the same time' 'Improving and having fun.'
 'improving while having fun' 'All of the above!' 'socializing'
 'all above' 'playing well' 'winning + having fun' 'having a distraction']
-------------
------ League ------
[nan '-' 'Gold' 'none' 'na' 'Diamond/Master' 'Not Applicable.' 'Bronze'
 'None' 'Silver 4' 'N/a' 'Gold Nova' 'Unranked - Low ELO' 'Not applicable'
 '0' 'Silver 2' 'not applicable' "I don't know what that means" '/' 'bad'
 '_' 'Legendary Eagle at CS, no league yet at LoL' 'Global top 0.5%'
 'Bronze 5' 'potato' 'diamond' 'Diamond' 'Master' 'SilverII' 'Platinum 3']
-------------
------ highestleague ------
[nan]
-------------
------ streams ------
[ 0.  2.  5.  1.  8. 10.  4.  3. 20. 35. 18.  6. 25. 15. 50. 14. 40. 30.
 16. nan 12.  7. 80. 90.  9. 11. 41. 21. 42. 29.]
-------------
------ SPIN1 ------
[ 1.  2.  0.  3.  4. nan]
-------------
------ SPIN2 ------
[ 0.  1.  2.  3.  4. nan]
-------------
------ SPIN3 ------
[ 0.  1.  2.  4.  3. nan]
-------------
------ SPIN4 ------
[ 1.  3.  2.  0.  4. nan]
-------------
------ SPIN5 ------
[ 0.  2.  3.  1.  4. nan]
-------------
------ SPIN6 ------
[ 0.  3.  4.  1.  2. nan]
-------------
------ SPIN7 ------
[ 1.  2.  0.  4.  3. nan]
-------------
------ SPIN8 ------
[ 0.  3.  1.  4.  2. nan]
-------------
------ SPIN9 ------
[ 0.  4.  2.  3.  1. nan]
-------------
------ SPIN10 ------
[ 0.  1.  3.  2.  4. nan]
-------------
------ SPIN11 ------
[ 0.  3.  1.  4.  2. nan]
-------------
------ SPIN12 ------
[ 0.  3.  1.  4.  2. nan]
-------------
------ SPIN13 ------
[ 0.  1. nan  2.  3.  4.]
-------------
------ SPIN14 ------
[ 1.  3.  2.  0.  4. nan]
-------------
------ SPIN15 ------
[ 0.  3.  4.  1.  2. nan]
-------------
------ SPIN16 ------
[ 1.  4.  0.  3.  2. nan]
-------------
------ SPIN17 ------
[ 0.  2.  1.  3.  4. nan]
-------------
------ Narcissism ------
[ 1.  4.  2.  5.  3. nan]
-------------
------ Gender ------
['Male' 'Female' 'Other']
-------------
------ Age ------
[25 41 32 28 19 24 29 23 27 21 18 35 30 22 20 33 26 36 40 34 50 39 31 63
 38 37 42 49 44 56]
-------------
------ Work ------
['Unemployed / between jobs' 'Employed' 'Student at college / university'
 'Student at school' nan]
-------------
------ Degree ------
['Bachelor\xa0(or equivalent)' 'High school diploma (or equivalent)'
 'Ph.D., Psy. D., MD (or equivalent)' 'Master\xa0(or equivalent)' 'None']
-------------
------ Birthplace ------
['USA' 'Germany' 'Finland' 'Canada' 'Australia' 'UK'
 'Bosnia and Herzegovina' 'South Korea' 'Romania' 'Sweden' 'Greece'
 'Ireland' 'Switzerland' 'Turkey' 'Bulgaria' 'Belgium' 'Latvia' 'Austria'
 'South Africa' 'Croatia' 'India' 'Netherlands' 'Denmark' 'Portugal'
 'France' 'Argentina' 'Colombia' 'Estonia' 'Russia' 'Slovakia']
-------------
------ Residence ------
['USA' 'Germany' 'South Korea' 'Japan' 'Finland' 'Canada' 'Australia' 'UK'
 'Bosnia and Herzegovina' 'Ireland' 'Malaysia' 'Romania' 'Sweden' 'Greece'
 'Turkey' 'Belgium' 'Latvia' 'Austria' 'South Africa' 'Croatia'
 'Switzerland' 'India' 'Netherlands' 'Denmark' 'Portugal' 'France'
 'Argentina' 'Estonia' 'Russia' 'Czech Republic']
-------------
------ Reference ------
['Reddit' 'Other' 'TeamLiquid.net' 'CrowdFlower' nan]
-------------
------ Playstyle ------
['Singleplayer' 'Multiplayer - online - with strangers'
 'Multiplayer - online - with online acquaintances or teammates'
 'Multiplayer - online - with real life friends'
 'Multiplayer - offline (people in the same room)' 'all of the above'
 'Multiplayer - Online - With real friends, and with online acquaintances, and strangers'
 'With strangers and friends' 'Singleplayer - online' 'watching'
 ' Multiplayer - online - with online acquaintances or teammates and with real life friends'
 'Multiplayer - online with friends, teammates and strangers'
 'Sometimes alone (Ranked), sometimes with friends. Everytime there are strangers in my game ofc'
 'Multiplayer online, with friends (sometimes in the same room), sometimes with strangers.'
 'I prefer playing with RLF but I also play it to make a living.'
 'Multiplayer - online - with strangers or real life friends'
 'Multiplayer - online with online team mates and real life friends'
 'Mutliplayer - A mix of friends/strangers.'
 'Multiplayer - online - with real life friends and strangers'
 'Play solo but in skype call with friends'
 'mix of real life friends and online acquaintances'
 'A combination of all of the aboive'
 'With real life friends against strangers' "It's really a bit of all"
 'multiplayer online with online friends' 'With friends and strangers.'
 'multiplayer - online with real life friends + stragners'
 'sometimes with friends, sometimes with strangers :)'
 'Multiplayer with online friends and real life friends'
 'Multilayer - online - with real life friends and online friends as well as with strangers']
-------------
------ accept ------
['Accept' nan]
-------------
------ GAD_T ------
[ 1  8  0 14 12 10 19  3  2  4 15  5  6  7 13 11  9 18 16 21 17 20]
-------------
------ SWL_T ------
[23 16 17 14 12 13 27 33 31 26 28 15 19  6  9 35  8  7 24 25 21 11 22 20
 10 30 29 18  5 34]
-------------
------ SPIN_T ------
[ 5. 33. 31. 11. 13. 26. nan 55.  6.  3. 23. 29. 30. 36.  8. 37.  0. 47.
 20. 39. 15. 12. 14. 46. 60. 41. 24.  9.  1. 17.]
-------------
------ Residence_ISO3 ------
['USA' 'DEU' 'KOR' 'JPN' 'FIN' 'CAN' 'AUS' 'GBR' 'BIH' 'IRL' 'MYS' 'ROU'
 'SWE' 'GRC' 'TUR' 'BEL' 'LVA' 'AUT' 'ZAF' 'HRV' 'CHE' 'IND' 'NLD' 'DNK'
 'PRT' 'FRA' 'ARG' 'EST' 'RUS' 'CZE']
-------------
------ Birthplace_ISO3 ------
['USA' 'DEU' 'FIN' 'CAN' 'AUS' 'GBR' 'BIH' 'KOR' 'ROU' 'SWE' 'GRC' 'IRL'
 'CHE' 'TUR' 'BGR' 'BEL' 'LVA' 'AUT' 'ZAF' 'HRV' 'IND' 'NLD' 'DNK' 'PRT'
 'FRA' 'ARG' 'COL' 'EST' 'RUS' 'SVK']
-------------
------ GAD1 ------
4
-------------
------ GAD2 ------
4
-------------
------ GAD3 ------
4
-------------
------ GAD4 ------
4
-------------
------ GAD5 ------
4
-------------
------ GAD6 ------
4
-------------
------ GAD7 ------
4
-------------
------ GADE ------
4
-------------
------ SWL1 ------
7
-------------
------ SWL2 ------
7
-------------
------ SWL3 ------
7
-------------
------ SWL4 ------
7
-------------
------ SWL5 ------
7
-------------
------ Game ------
11
-------------
------ Platform ------
3
-------------
------ Hours ------
84
-------------
------ earnings ------
314
-------------
------ whyplay ------
407
-------------
------ League ------
1455
-------------
------ highestleague ------
0
-------------
------ streams ------
65
-------------
------ SPIN1 ------
5
-------------
------ SPIN2 ------
5
-------------
------ SPIN3 ------
5
-------------
------ SPIN4 ------
5
-------------
------ SPIN5 ------
5
-------------
------ SPIN6 ------
5
-------------
------ SPIN7 ------
5
-------------
------ SPIN8 ------
5
-------------
------ SPIN9 ------
5
-------------
------ SPIN10 ------
5
-------------
------ SPIN11 ------
5
-------------
------ SPIN12 ------
5
-------------
------ SPIN13 ------
5
-------------
------ SPIN14 ------
5
-------------
------ SPIN15 ------
5
-------------
------ SPIN16 ------
5
-------------
------ SPIN17 ------
5
-------------
------ Narcissism ------
5
-------------
------ Gender ------
3
-------------
------ Age ------
30
-------------
------ Work ------
4
-------------
------ Degree ------
5
-------------
------ Birthplace ------
126
-------------
------ Residence ------
109
-------------
------ Reference ------
4
-------------
------ Playstyle ------
298
-------------
------ accept ------
1
-------------
------ GAD_T ------
22
-------------
------ SWL_T ------
31
-------------
------ SPIN_T ------
69
-------------
------ Residence_ISO3 ------
107
-------------
------ Birthplace_ISO3 ------
124
-------------
1444
1199
Out[14]:
(12081, 48)
In [15]:
#NORMALISATION

from sklearn.preprocessing import MinMaxScaler
cols = ['Hours' , 'streams' ,'Age','GAD_T', 'SWL_T', 'SPIN_T']
sc = MinMaxScaler()
df[cols] = sc.fit_transform(df[cols])
In [16]:
#Boxplot

fig, ax = plt.subplots(6, 6, figsize=(20, 25))

# Flatten the axes array so that we can iterate through it
axes = ax.flatten()

# Loop through columns and plot them in subplots
for i, (col_name, col_data) in enumerate(df.select_dtypes(exclude=['object']).iteritems()):
    if i < len(axes):
        axes[i].boxplot(col_data, vert=False)
        axes[i].set_title(col_name)

fig.suptitle('Box plots')
plt.tight_layout()
plt.show()
In [17]:
df.drop( df[df['Age'] > 50].index , axis = 0 , inplace=True)

#Correlations

corr = df.corr(method='spearman')
mask = np.triu(np.ones_like(corr, dtype=bool))
plt.figure(figsize = (35, 35))
cormat = sns.heatmap(corr, mask=mask, annot=True, cmap='YlGnBu', linewidths=1, fmt=".2f")
cormat.set_title('Correlation Matrix')
plt.show()
In [18]:
#Density plot

fig, ax = plt.subplots(3, 2, figsize=(15, 15))
axes = ax.flatten()
columns_to_plot = ['GAD_T', 'SWL_T', 'SPIN_T', 'Hours', 'streams', 'Narcissism']

for i, col_name in enumerate(columns_to_plot):
    if i < len(axes):
        sns.kdeplot(df[col_name], ax=axes[i], color='b')
        axes[i].set_title(col_name)

fig.suptitle('Density plots')
plt.tight_layout()
plt.show()
In [19]:
#Line Plot

fig, axes = plt.subplots(1, 3, figsize=(20, 5) )
fig.suptitle('Game vs Anxiety')
labels = ['SPIN_T', 'GAD_T' , 'SWL_T']

for count, ele in enumerate(labels):
    sns.lineplot(x=ele , y="Hours", data=df , ax = axes[count])
    axes[count].set_title(f"{ele} vs Gaming Hours")
fig, axes = plt.subplots(1, 3, figsize=(20, 5) )
for count, ele in enumerate(labels):
    sns.lineplot(x=ele , y="streams", data=df , ax = axes[count])
    axes[count].set_title(f"{ele} vs Watching Hours")

font_dict = {'weight': 'normal', 'size': 12}
mpl.rc('font', **font_dict)
def create_pie_chart(data, title, explode=None, ax=None):
    myexplode = explode if explode else [0] * len(data)
    data.plot(kind='pie', autopct='%1.1f%%', pctdistance=0.5, labeldistance=1, explode=myexplode, ax=ax)
    ax.set_title(title)

fig, axes = plt.subplots(1, 3, figsize=(23, 6))
fig.suptitle('Pie Charts')

create_pie_chart(df['Platform'].value_counts(), 'Platform used', explode=[0, 0.1, 0.2], ax=axes[0])
create_pie_chart(df['Playstyle'].value_counts().head(5), 'Playstyle', explode=[0, 0, 0, 0, 0.05], ax=axes[1])
create_pie_chart(df['GADE'].value_counts().head(5), 'General anxiety and life balance', explode=[0, 0, 0, 0.1], ax=axes[2])

plt.tight_layout()
plt.subplots_adjust(top=0.85)

plt.show()
In [20]:
#Count Plot

fig, axes = plt.subplots(1, 3, figsize=(20, 5) )
fig.suptitle('Game vs Anxiety')
labels = ['SPIN_T', 'GAD_T' , 'SWL_T','Narcissism']

for count, ele in enumerate(labels[:-1]):
    df.groupby('Game')[ele].mean().sort_values(ascending=False).plot(kind='bar' , ax = axes[count])
    axes[count].set_title(f"Game vs {ele}")
    
fig, axes = plt.subplots(1 , 4, figsize=(20, 5) )
fig.suptitle('Residence vs Anxiety')

for count, ele in enumerate(labels):
    df.groupby('Residence_ISO3')[ele].mean().head(10).sort_values(ascending=False).plot(kind='bar' , ax = axes[count])
    axes[count].set_title(f"Residence vs {ele}")

labels = ['Game', 'Residence_ISO3' , 'Gender','GADE' , 'Degree' , 'Work','Narcissism','Playstyle']
plt.figure(figsize=(9,14))
plt.suptitle('Gaming Hours')
for count, ele in enumerate(labels,1):
    plt.subplot(3 , 3 , count)
    plt.tight_layout()
    df.groupby(ele)['Hours'].mean().head(10).sort_values(ascending=False).plot(kind='bar')
    
plt.figure(figsize=(9,14))
plt.suptitle('Streams hours')
for count, ele in enumerate(labels,1):
    plt.subplot(3 , 3 , count)
    plt.tight_layout()
    df.groupby(ele)['streams'].mean().head(10).sort_values(ascending=False).plot(kind='bar')
plt.show()



plt.figure(figsize=(7,7))
df.groupby('Age')['SWL_T'].mean().plot()
plt.title("Age vs Satisfication with life")
plt.xlabel("Age")
plt.ylabel("Satisfication with life");

labels = ['Work', 'Degree' , 'Playstyle']
plt.figure(figsize=(15,6))
plt.suptitle('Satisifcation with life')
for count, ele in enumerate(labels,1):
    plt.subplot(1 , 4 , count)
    plt.tight_layout()
    df.groupby(ele)['SWL_T'].mean().head(10).sort_values(ascending=False).plot(kind='bar')
plt.show()

plt.figure(figsize=(5,5))
df.groupby('GAD_T')['SPIN_T'].mean().plot()
plt.title("GAD_T vs SPIN_T")
plt.xlabel("GAD_Total")
plt.ylabel("SPIN_Total")
plt.show()

plt.figure(figsize=(10,7))
df.groupby('League').mean()['Hours'].sort_values(ascending=False).plot(kind='bar')
plt.title("League vs Hours")
plt.xlabel("League")
plt.ylabel("Average Hours")
plt.show()
x=df.SPIN_T.mean()
y = df.SWL_T.mean()
z = df.GAD_T.mean()
c=[x,y,z]
plt.figure(figsize=(10,7))
plt.bar(['Social Phobia','Satisification with life','General Anxiety Disorder'],c,color ='maroon',
        width = 0.5)
plt.show()
In [21]:
#Label Encoding

le = LabelEncoder()
for i in df.columns:
    if df[i].dtype == 'object':
        df[i] = le.fit_transform(df[i])
df.head()
Out[21]:
GAD1 GAD2 GAD3 GAD4 GAD5 GAD6 GAD7 GADE SWL1 SWL2 SWL3 SWL4 SWL5 Game Platform Hours earnings whyplay League streams SPIN1 SPIN2 SPIN3 SPIN4 SPIN5 SPIN6 SPIN7 SPIN8 SPIN9 SPIN10 SPIN11 SPIN12 SPIN13 SPIN14 SPIN15 SPIN16 SPIN17 Narcissism Gender Age Work Degree Reference Playstyle GAD_T SWL_T SPIN_T Residence_ISO3
0 0 0 0 0 1 0 0 1 3 5 5 5 5 8 0 0.136364 1 1 48 0.00 1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 1.0 1 0.155556 3 0 2 4 0.047619 0.600000 0.073529 102
1 1 2 2 2 0 1 0 2 3 5 2 5 1 7 1 0.072727 1 1 48 0.02 2.0 1.0 1.0 3.0 2.0 3.0 1.0 3.0 4.0 0.0 3.0 0.0 1.0 3.0 3.0 1.0 2.0 1.0 1 0.511111 3 0 2 3 0.380952 0.366667 0.485294 102
3 0 0 0 0 0 0 0 1 2 5 5 3 2 7 1 0.181818 1 2 11 0.05 2.0 0.0 0.0 1.0 3.0 0.0 0.0 0.0 0.0 0.0 1.0 3.0 0.0 0.0 1.0 0.0 0.0 2.0 1 0.222222 0 0 2 1 0.000000 0.400000 0.161765 102
4 2 1 2 2 2 3 2 3 2 2 4 5 1 7 0 0.181818 1 1 48 0.01 2.0 0.0 0.0 0.0 1.0 0.0 0.0 4.0 2.0 0.0 1.0 0.0 0.0 0.0 0.0 3.0 0.0 1.0 1 0.022222 0 1 2 3 0.666667 0.300000 0.191176 56
5 0 0 0 0 0 1 0 1 3 5 3 3 3 7 0 0.036364 1 3 11 0.00 1.0 1.0 2.0 3.0 2.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 1.0 0.0 0.0 2.0 1 0.133333 0 0 2 2 0.047619 0.400000 0.191176 102
In [22]:
#feature engineering
corr = df.corr(method='spearman')
mask = np.triu(np.ones_like(corr, dtype=bool))
plt.figure(figsize = (35, 35))
cormat = sns.heatmap(corr, mask=mask, annot=True, cmap='YlGnBu', linewidths=1, fmt=".2f")
cormat.set_title('Correlation Matrix')
plt.show()
In [23]:
df1 = df[['GAD_T'  , 'SWL_T' , 'SPIN_T' ]]
df2 = df[[ 'Age' ,  'Hours' , 'streams' ]]
pc1 = PCA(n_components=2)
pc2 = PCA(n_components=2)
x1 = pc1.fit_transform(df1)
x2 = pc2.fit_transform(df2)
x = x1 + x2
In [25]:
model = KMeans(n_clusters = 5, init = "k-means++", max_iter = 300, n_init = 40, random_state = 0)
y_clusters = model.fit_predict(x)
In [26]:
pd.Series(y_clusters).value_counts().plot(kind='bar');
In [27]:
plt.figure(figsize=(13,13))

plt.scatter(x[y_clusters == 0, 0], x[y_clusters == 0, 1], s = 60, c = 'red', label = 'Cluster1')
plt.scatter(x[y_clusters == 1, 0], x[y_clusters == 1, 1], s = 60, c = 'blue', label = 'Cluster2')
plt.scatter(x[y_clusters == 2, 0], x[y_clusters == 2, 1], s = 60, c = 'green', label = 'Cluster3')
plt.scatter(x[y_clusters == 3, 0], x[y_clusters == 3, 1], s = 60, c = 'violet', label = 'Cluster4')
plt.scatter(x[y_clusters == 4, 0], x[y_clusters == 4, 1], s = 60, c = 'yellow', label = 'Cluster5') 

plt.legend()
plt.show()
In [29]:
#Adding label column to train our model for predicting in which group you are
df['Label'] = y_clusters
In [30]:
#Checking if our data is unbalanced
plt.rcParams.update({'font.size': 12})
df['Label'].value_counts()
Out[30]:
4    3245
1    3215
0    2360
2    1992
3    1269
Name: Label, dtype: int64
In [31]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]
In [32]:
#Splitting data to train and test

X_train , X_test ,y_train , y_test = train_test_split(X,y,train_size=.8,random_state=44)
In [33]:
l1 = df[df['Label'] == 0]['GAD_T'].mean()
l2 = df[df['Label'] == 0]['SWL_T'].mean()
l3 = df[df['Label'] == 0]['SPIN_T'].mean()
l4 = df[df['Label'] == 0]['Hours'].mean()
l5 = df[df['Label'] == 0]['streams'].mean()

c=[l1,l2,l3,l4,l5]
plt.figure(figsize=(5,5))
plt.bar(['GAD_T','SWL_T','SPIN_T' ,'Hours' , 'streams'],c,color ='maroon',width = 0.5)
plt.show()
In [34]:
l1 = df[df['Label'] == 1]['GAD_T'].mean()
l2 = df[df['Label'] == 1]['SWL_T'].mean()
l3 = df[df['Label'] == 1]['SPIN_T'].mean()
l4 = df[df['Label'] == 1]['Hours'].mean()
l5 = df[df['Label'] == 1]['streams'].mean()

c=[l1,l2,l3,l4,l5]
plt.figure(figsize=(5,5))
plt.bar(['GAD_T','SWL_T','SPIN_T' ,'Hours' , 'streams'],c,color ='maroon',width = 0.5)
plt.show()
In [35]:
l1 = df[df['Label'] == 2]['GAD_T'].mean()
l2 = df[df['Label'] == 2]['SWL_T'].mean()
l3 = df[df['Label'] == 2]['SPIN_T'].mean()
l4 = df[df['Label'] == 2]['Hours'].mean()
l5 = df[df['Label'] == 2]['streams'].mean()

c=[l1,l2,l3,l4,l5]
plt.figure(figsize=(5,5))
plt.bar(['GAD_T','SWL_T','SPIN_T' ,'Hours' , 'streams'],c,color ='maroon',width = 0.5)
plt.show()
In [36]:
l1 = df[df['Label'] == 3]['GAD_T'].mean()
l2 = df[df['Label'] == 3]['SWL_T'].mean()
l3 = df[df['Label'] == 3]['SPIN_T'].mean()
l4 = df[df['Label'] == 3]['Hours'].mean()
l5 = df[df['Label'] == 3]['streams'].mean()

c=[l1,l2,l3,l4,l5]
plt.figure(figsize=(5,5))
plt.bar(['GAD_T','SWL_T','SPIN_T' ,'Hours' , 'streams'],c,color ='maroon',width = 0.5)
plt.show()
In [37]:
l1 = df[df['Label'] == 4]['GAD_T'].mean()
l2 = df[df['Label'] == 4]['SWL_T'].mean()
l3 = df[df['Label'] == 4]['SPIN_T'].mean()
l4 = df[df['Label'] == 4]['Hours'].mean()
l5 = df[df['Label'] == 4]['streams'].mean()

c=[l1,l2,l3,l4,l5]
plt.figure(figsize=(5,5))
plt.bar(['GAD_T','SWL_T','SPIN_T' ,'Hours' , 'streams'],c,color ='maroon',width = 0.5)
plt.show()
In [38]:
#Random forest classification modelling
clf = RandomForestClassifier(n_estimators = 1000 , random_state=44, max_depth=8) 
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
clf.score(X_test,y_test)
Out[38]:
0.8200248241621845
In [39]:
clf.score(X_train,y_train)
Out[39]:
0.8713783112582781
In [40]:
from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(y_pred,y_test))
              precision    recall  f1-score   support

           0       0.81      0.80      0.80       497
           1       0.89      0.86      0.87       683
           2       0.72      0.77      0.74       332
           3       0.77      0.95      0.85       216
           4       0.84      0.78      0.81       689

    accuracy                           0.82      2417
   macro avg       0.80      0.83      0.82      2417
weighted avg       0.82      0.82      0.82      2417

In [41]:
confusion_matrix(y_pred,y_test)
Out[41]:
array([[396,   0,  30,  40,  31],
       [  3, 586,  36,   0,  58],
       [ 18,  18, 256,  23,  17],
       [  7,   0,   3, 206,   0],
       [ 66,  53,  32,   0, 538]])
In [42]:
answer = clf.predict(X_test.iloc[30:45,:])
print(list(answer))
print(y_test.iloc[30:45].to_list())
[1, 1, 0, 4, 4, 4, 1, 4, 2, 3, 4, 0, 0, 1, 0]
[1, 1, 2, 4, 0, 4, 1, 0, 2, 3, 1, 4, 0, 1, 0]
In [43]:
lc = LogisticRegression(max_iter=3250 , n_jobs=-1 )
lc.fit(X_train,y_train)
lc_pred = lc.predict(X_test)
accuracy_score(lc_pred,y_test)
/Users/niveditaj/opt/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Out[43]:
0.9304923458833264
In [44]:
print(confusion_matrix(lc_pred,y_test))
[[447   0   7  13  15]
 [  0 619  10   0  20]
 [  3   8 334   8   8]
 [ 12   0   4 248   0]
 [ 28  30   2   0 601]]
In [45]:
lc.score(X_train,y_train)
Out[45]:
0.9402938741721855
In [46]:
lc.score(X_test,y_test)
Out[46]:
0.9304923458833264
In [48]:
pip install xgboost
Collecting xgboost
  Downloading xgboost-2.0.3-py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64.whl (2.2 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.2/2.2 MB 21.0 MB/s eta 0:00:00a 0:00:01
Requirement already satisfied: numpy in /Users/niveditaj/opt/anaconda3/lib/python3.9/site-packages (from xgboost) (1.22.4)
Requirement already satisfied: scipy in /Users/niveditaj/opt/anaconda3/lib/python3.9/site-packages (from xgboost) (1.7.3)
Installing collected packages: xgboost
Successfully installed xgboost-2.0.3
Note: you may need to restart the kernel to use updated packages.
In [49]:
from xgboost import XGBClassifier

xg = XGBClassifier(n_estimators=200, max_depth=7, eta=0.1, subsample=0.7, colsample_bytree=0.8)
xg.fit(X_train,y_train)
xgpred = xg.predict(X_test)
accuracy_score(xgpred,y_test)
Out[49]:
0.9429044269755896
In [50]:
xg.score(X_test,y_test)
Out[50]:
0.9429044269755896
In [51]:
# Export the cleaned dataset to a CSV file
df.to_csv("cleaned_dataset.csv", index=False)
In [ ]: